Raw data quality control
Initialization
We start the analysis by initializing the packages required for all the analysis performed in this section. We also define the root directory, within which all the input/output operations for this project will be performed. At the end of this document, detailed software version information is provided for easier reproducibility of the analysis.
library(reshape2)
library(ggplot2)
library(ggrepel)
library(ggpubr)
library(lsr)
library(plotly)
library(DT)
library(WriteXLS)
library(ggallin)
path = "/Users/ashwin/Documents/Projects/YeastScreen/Essential_DAmP_screen/"Raw data summary
We show below the summary for the raw screening data - Yeast Eessential DaMP library.
rawDat = readRDS(paste0(path, "data/workspaces/YeastRedox_DAmP_RawData.RDS"))
rawDat$Signal_485_520 = log10(rawDat$Signal_485_520)
rawDat$Signal_400_520 = log10(rawDat$Signal_400_520)
summary(rawDat) Plate Well_96 Well Group
2 : 3888 A11 : 576 A41 : 36 AL : 576
4 : 3840 D10 : 576 A42 : 36 AT : 576
3 : 3792 D2 : 576 A43 : 36 K : 576
9 : 3696 A1 : 528 A44 : 36 A : 528
8 : 3360 D1 : 528 B41 : 36 AK : 528
5 : 3216 D5 : 528 B42 : 36 AO : 528
(Other):16032 (Other):34512 (Other):37608 (Other):34512
Content SystematicName SGD.ID Gene.Symbol
Blank :9456 Length:37824 Length:37824 Length:37824
Control :9456 Class :character Class :character Class :character
Cytoplasm :9456 Mode :character Mode :character Mode :character
Mitochondria:9456
roGFP2.ratio Signal_485_520 Signal_400_520 Type
Min. :-9.746 Min. :3.510 Min. :4.626 Glucose :12608
1st Qu.: 0.469 1st Qu.:4.379 1st Qu.:4.963 Galactose:12608
Median : 0.568 Median :4.980 Median :5.088 Glycerol :12608
Mean : 0.648 Mean :4.768 Mean :5.066
3rd Qu.: 0.817 3rd Qu.:5.104 3rd Qu.:5.172
Max. : 6.500 Max. :5.415 Max. :5.415
NA's :9456
Background Fluorescence
Firstly, we look at the distribution of background flourescence across all plates and retain only those colonies for Control, Mitochondria and Cytoplasm whose fluorescence signal exceeds the 90th percentile of blank 485-520 and 400-520 fluorescence signals
tmpdat = rawDat[, c("Plate", "Content", "Signal_485_520", "Signal_400_520", "Type")]
tmpdat = melt(tmpdat)
a = droplevels(tmpdat[tmpdat$variable == "Signal_485_520" & tmpdat$Content == "Blank",
])
per90_485_520 = lapply(split(a, a$Plate), function(x) split(x, x$Type))
per90_485_520 = sapply(per90_485_520, function(x) sapply(x, function(y) quantile(y$value,
0.9)))
per90_485_520 = melt(t(per90_485_520))
per90_485_520$Var2 = gsub(".90%", "", per90_485_520$Var2)
colnames(per90_485_520) = c("Plate", "Type", "value")
a = droplevels(tmpdat[tmpdat$variable == "Signal_400_520" & tmpdat$Content == "Blank",
])
per90_400_520 = lapply(split(a, a$Plate), function(x) split(x, x$Type))
per90_400_520 = sapply(per90_400_520, function(x) sapply(x, function(y) quantile(y$value,
0.9)))
per90_400_520 = melt(t(per90_400_520))
per90_400_520$Var2 = gsub(".90%", "", per90_400_520$Var2)
colnames(per90_400_520) = c("Plate", "Type", "value")
# Filtering out colonies
if (identical(paste0(per90_400_520$Plate, per90_400_520$Type), paste0(per90_485_520$Plate,
per90_485_520$Type))) {
cutoff = data.frame(per90_400_520[, 1:2], per90_400_520 = per90_400_520$value,
per90_485_520 = per90_485_520$value, stringsAsFactors = F)
}
rawDatFilt = lapply(split(rawDat, rawDat$Plate), function(x) split(x, x$Type))
for (i in 1:nrow(cutoff)) {
x = rawDatFilt[[as.character(cutoff$Plate[i])]][[cutoff$Type[i]]]
x = x[x$Signal_485_520 > cutoff$per90_485_520[i] & x$Signal_400_520 > cutoff$per90_400_520[i],
]
rawDatFilt[[as.character(cutoff$Plate[i])]][[cutoff$Type[i]]] = x
}
rawDatFilt = do.call("rbind", lapply(rawDatFilt, function(x) do.call("rbind", x)))
# Plot background fluorescence distributions
p1 = ggplot(tmpdat, aes(x = value, fill = Content)) + geom_density(alpha = 0.5) +
theme_bw(base_size = 8) + labs(subtitle = "Essential DAmP screen", x = "Fluorescence signal (in log10)",
y = "Density", fill = "") + facet_wrap(variable ~ Type, scales = "free", ncol = 3) +
theme(legend.position = "right", panel.grid = element_blank())
p2 = ggplot(tmpdat[tmpdat$variable == "Signal_485_520", ], aes(x = value, fill = Content)) +
geom_density(alpha = 0.5) + theme_bw(base_size = 8) + labs(subtitle = "Essential DAmP screen | Signal_485_520",
x = "Fluorescence signal (in log10)", y = "Density", fill = "") + geom_vline(data = per90_485_520,
aes(xintercept = value)) + facet_wrap(Plate ~ Type, scales = "free", ncol = 3) +
theme(legend.position = "right", panel.grid = element_blank())
p3 = ggplot(tmpdat[tmpdat$variable == "Signal_400_520", ], aes(x = value, fill = Content)) +
geom_density(alpha = 0.5) + theme_bw(base_size = 8) + labs(subtitle = "Essential DAmP screen | Signal_400_520",
x = "Fluorescence signal (in log10)", y = "Density", fill = "") + geom_vline(data = per90_400_520,
aes(xintercept = value)) + facet_wrap(Plate ~ Type, scales = "free", ncol = 3) +
theme(legend.position = "right", panel.grid = element_blank())
ggsave(filename = paste0(path, "analysis/QC/background_fluorescence.pdf"), plot = p1,
width = 7, height = 3.5)
ggsave(filename = paste0(path, "analysis/QC/background_fluorescence_Signal_485_520.pdf"),
plot = p2, width = 7, height = 15, limitsize = F)
ggsave(filename = paste0(path, "analysis/QC/background_fluorescence_Signal_400_520.pdf"),
plot = p3, width = 7, height = 15, limitsize = F)
p1[1] "total number of colonies in the raw data - 37824"
paste0("total number of colonies in the background flourescence filtered data - ",
nrow(rawDatFilt))[1] "total number of colonies in the background flourescence filtered data - 27541"
[1] "% of colonies KEPT - 72.81"
Cleaning raw data
Firstly we read the raw yeast redox screen data and remove outlier (extreme values) and NA. Typically, outlier removal is not considered good data analysis practice, however, we do it here because we know that the extreme values comes from technical artifacts. For example, roGFP2 ratios cannot be negative and those which are significantly higher than the plate average are ambiguous signals.
Outliers were detected as -
- upper outlier values > 75th quantile value for a plate + 3 * plate IQR value
- lower outlier values < 25th quantile value for a plate - 3 * plate IQR value
The outlier values are all set to NA and values greater than the lower bound threshold but less than 0 were set to pseudo minimum value of 0.0001 as roGFP2 ratios can’t be less than 0.
We show these extreme outliers below, ploting the distribution of values from each plate.
tmpdat = na.omit(rawDat)
ggplot(tmpdat, aes(x = roGFP2.ratio, color = Plate)) + geom_density() + theme_bw(base_size = 8) +
labs(x = "roGFP2 ratio (in log10)") + scale_x_continuous(trans = pseudolog10_trans,
breaks = c(-50, -2:5, 50)) + facet_wrap(Content ~ Type, scales = "free") + theme(legend.position = "none",
panel.grid = element_blank())rawDatCleaned = vector("list", length = 3)
names(rawDatCleaned) = c("Glucose", "Galactose", "Glycerol")
for (i in names(rawDatCleaned)) {
dat = rawDat[rawDat$Type == i, ]
dat = split(dat, dat$Plate)
dat = lapply(dat, function(x) {
vals = x$roGFP2.ratio
lb = quantile(vals, probs = 0.25, na.rm = T) - (3 * IQR(vals, na.rm = T))
ub = quantile(vals, probs = 0.75, na.rm = T) + (3 * IQR(vals, na.rm = T))
vals[vals < lb | vals > ub] = NA
vals[vals > lb & vals < 0] = 10^-4
x$roGFP2.ratio = vals
x = x[!is.na(x$roGFP2.ratio), ]
return(x)
})
rawDatCleaned[[i]] = do.call("rbind", dat)
rm(dat)
}
rawDatCleaned = do.call("rbind", rawDatCleaned)
rawDatCleaned = droplevels(rawDatCleaned)
rownames(rawDatCleaned) = 1:nrow(rawDatCleaned)Next, lets check the summaries of the raw roGFP2 redox screen data and the cleaned raw data for comparison.
Raw data looked like -
Plate Well_96 Well Group
2 : 2892 D10 : 451 A41 : 36 AT : 451
4 : 2690 A11 : 447 A42 : 36 K : 447
9 : 2682 H8 : 413 A43 : 36 CN : 413
3 : 2595 G9 : 405 A44 : 36 CC : 405
5 : 2462 D7 : 402 B41 : 36 AQ : 402
8 : 2438 D12 : 394 B42 : 36 AV : 394
(Other):11782 (Other):25029 (Other):27325 (Other):25029
Content SystematicName SGD.ID Gene.Symbol
Blank : 812 Length:27541 Length:27541 Length:27541
Control :8671 Class :character Class :character Class :character
Cytoplasm :8826 Mode :character Mode :character Mode :character
Mitochondria:9232
roGFP2.ratio Signal_485_520 Signal_400_520 Type
Min. :-2.4333 Min. :3.922 Min. :4.820 Glucose :9347
1st Qu.: 0.4760 1st Qu.:4.949 1st Qu.:5.071 Galactose:8557
Median : 0.5752 Median :5.050 Median :5.134 Glycerol :9637
Mean : 0.6558 Mean :5.020 Mean :5.137
3rd Qu.: 0.8239 3rd Qu.:5.138 3rd Qu.:5.201
Max. : 3.1356 Max. :5.415 Max. :5.415
NA's :812
Cleaned data looked like -
Plate Well_96 Well Group
2 : 2807 A11 : 432 A41 : 36 AT : 432
9 : 2602 D10 : 432 A42 : 36 K : 432
4 : 2601 D7 : 396 A43 : 36 AQ : 396
3 : 2501 G9 : 396 A44 : 36 CC : 396
5 : 2396 D5 : 382 B41 : 36 AO : 382
8 : 2363 E6 : 365 B42 : 36 BB : 365
(Other):11443 (Other):24310 (Other):26497 (Other):24310
Content SystematicName SGD.ID Gene.Symbol
Control :8670 Length:26713 Length:26713 Length:26713
Cytoplasm :8825 Class :character Class :character Class :character
Mitochondria:9218 Mode :character Mode :character Mode :character
roGFP2.ratio Signal_485_520 Signal_400_520 Type
Min. :0.0001 Min. :4.449 Min. :4.831 Glucose :9059
1st Qu.:0.4760 1st Qu.:4.959 1st Qu.:5.076 Galactose:8287
Median :0.5751 Median :5.055 Median :5.137 Glycerol :9367
Mean :0.6553 Mean :5.045 Mean :5.141
3rd Qu.:0.8237 3rd Qu.:5.141 3rd Qu.:5.204
Max. :2.2561 Max. :5.415 Max. :5.415
Percentage of data removed after outlier removal -
[1] 3.01
Distribution of raw roGFP2 ratios
Distribution of raw roGFP2 ratios (absolute ratios) per plate
p = ggplot(rawDatCleaned) + theme_bw(base_size = 9) + geom_boxplot(aes(x = Plate,
y = roGFP2.ratio), outlier.size = 0.1) + facet_grid(Content ~ Type, scales = "free") +
theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5))
ggsave(filename = paste0(path, "analysis/QC/roGFP2_ratio_RAWdata_distribution_nutrient_compartments.pdf"),
plot = p, width = 12, height = 5)
pIdentification of edge effects
A well known technical issue with yeast mutant screens is the altered growth characteristics of yeast colonies growing at the edge of plates. Next, we look at how roGFP2 ratios are different between edge wells and non-edge wells.
These are the edge wells and the mutants originating from these locations should be carefully analyzed.
edge = as.character(unlist(read.csv(paste0(path, "data/annotations/edge_wells.txt"))))
# Print edge wells
edge [1] "I01" "Q01" "Y01" "A03" "D02" "F04" "I03" "L02" "N04" "Q03" "T02" "V04"
[13] "Y03" "b02" "d04" "A09" "A17" "A25" "A33" "A41" "B05" "B13" "B21" "B29"
[25] "B37" "B45" "C09" "C17" "C25" "C33" "C41" "D05" "D13" "D21" "D29" "D37"
[37] "D45" "F45" "H45" "J45" "L45" "N45" "P45" "R45" "T45" "V45" "X45" "Z45"
[49] "b05" "b13" "b21" "b29" "b37" "b45" "c09" "c17" "c25" "c33" "c41" "d05"
[61] "d13" "d21" "d29" "d37" "d45" "e09" "e17" "e25" "e33" "e41" "J01" "R01"
[73] "Z01" "A04" "D03" "G02" "I04" "L03" "O02" "Q04" "T03" "W02" "Y04" "b03"
[85] "e02" "A10" "A18" "A26" "A34" "A42" "B06" "B14" "B22" "B30" "B38" "B46"
[97] "C10" "C18" "C26" "C34" "C42" "D06" "D14" "D22" "D30" "D38" "D46" "F46"
[109] "H46" "J46" "L46" "N46" "P46" "R46" "T46" "V46" "X46" "Z46" "b06" "b14"
[121] "b22" "b30" "b38" "b46" "c10" "c18" "c26" "c34" "c42" "d06" "d14" "d22"
[133] "d30" "d38" "d46" "e10" "e18" "e26" "e34" "e42" "K01" "S01" "a01" "B02"
[145] "D04" "G03" "J02" "L04" "O03" "R02" "T04" "W03" "Z02" "b04" "e03" "A11"
[157] "A19" "A27" "A35" "A43" "B07" "B15" "B23" "B31" "B39" "B47" "C11" "C19"
[169] "C27" "C35" "C43" "D07" "D15" "D23" "D31" "D39" "D47" "F47" "H47" "J47"
[181] "L47" "N47" "P47" "R47" "T47" "V47" "X47" "Z47" "b07" "b15" "b23" "b31"
[193] "b39" "b47" "c11" "c19" "c27" "c35" "c43" "d07" "d15" "d23" "d31" "d39"
[205] "d47" "e11" "e19" "e27" "e35" "e43" "L01" "T01" "b01" "B03" "E02" "G04"
[217] "J03" "M02" "O04" "R03" "U02" "W04" "Z03" "c02" "e04" "A12" "A20" "A28"
[229] "A36" "A44" "B08" "B16" "B24" "B32" "B40" "B48" "C12" "C20" "C28" "C36"
[241] "C44" "D08" "D16" "D24" "D32" "D40" "D48" "F48" "H48" "J48" "L48" "N48"
[253] "P48" "R48" "T48" "V48" "X48" "Z48" "b08" "b16" "b24" "b32" "b40" "b48"
[265] "c12" "c20" "c28" "c36" "c44" "d08" "d16" "d24" "d32" "d40" "d48" "e12"
[277] "e20" "e28" "e36" "e44" "M01" "U01" "c01" "B04" "E03" "H02" "J04" "M03"
[289] "P02" "R04" "U03" "X02" "Z04" "c03" "A05" "A13" "A21" "A29" "A37" "A45"
[301] "B09" "B17" "B25" "B33" "B41" "C05" "C13" "C21" "C29" "C37" "C45" "D09"
[313] "D17" "D25" "D33" "D41" "E45" "G45" "I45" "K45" "M45" "O45" "Q45" "S45"
[325] "U45" "W45" "Y45" "a45" "b09" "b17" "b25" "b33" "b41" "c05" "c13" "c21"
[337] "c29" "c37" "c45" "d09" "d17" "d25" "d33" "d41" "e05" "e13" "e21" "e29"
[349] "e37" "e45" "N01" "V01" "d01" "C02" "E04" "H03" "K02" "M04" "P03" "S02"
[361] "U04" "X03" "a02" "c04" "A06" "A14" "A22" "A30" "A38" "A46" "B10" "B18"
[373] "B26" "B34" "B42" "C06" "C14" "C22" "C30" "C38" "C46" "D10" "D18" "D26"
[385] "D34" "D42" "E46" "G46" "I46" "K46" "M46" "O46" "Q46" "S46" "U46" "W46"
[397] "Y46" "a46" "b10" "b18" "b26" "b34" "b42" "c06" "c14" "c22" "c30" "c38"
[409] "c46" "d10" "d18" "d26" "d34" "d42" "e06" "e14" "e22" "e30" "e38" "e46"
[421] "O01" "W01" "e01" "C03" "F02" "H04" "K03" "N02" "P04" "S03" "V02" "X04"
[433] "a03" "d02" "A07" "A15" "A23" "A31" "A39" "A47" "B11" "B19" "B27" "B35"
[445] "B43" "C07" "C15" "C23" "C31" "C39" "C47" "D11" "D19" "D27" "D35" "D43"
[457] "D47" "G47" "I47" "K47" "M47" "O47" "Q47" "S47" "U47" "W47" "Y47" "a47"
[469] "b11" "b19" "b27" "b35" "b43" "c07" "c15" "c23" "c31" "c39" "c47" "d11"
[481] "d19" "d27" "d35" "d43" "e07" "e15" "e23" "e31" "e39" "e47" "P01" "X01"
[493] "A02" "C04" "F03" "I02" "K04" "N03" "Q02" "S04" "V03" "Y02" "a04" "d03"
[505] "A08" "A16" "A24" "A32" "A40" "A48" "B12" "B20" "B28" "B36" "B44" "C08"
[517] "C16" "C24" "C32" "C40" "C48" "D12" "D20" "D28" "D36" "D44" "D48" "G48"
[529] "I48" "K48" "M48" "O48" "Q48" "S48" "U48" "W48" "Y48" "a48" "b12" "b20"
[541] "b28" "b36" "b44" "c08" "c16" "c24" "c32" "c40" "c48" "d12" "d20" "d28"
[553] "d36" "d44" "e08" "e16" "e24" "e32" "e40" "e48"
edgeDat = rep("non edge wells", nrow(rawDatCleaned))
edgeDat[rawDatCleaned$Well %in% edge] = "edge wells"
edge = rawDatCleaned
edge$edge = edgeDat
rm(edgeDat)Visulaizing the differences using both effect size (Fold change) and pvalue.
For none of the plates and conditions, the edge effects exceed the log fold change of +/- 1
# Computing distribution difference statistics
stats = split(edge, paste(edge$Plate, edge$Type, edge$Content, sep = "-"))
stats = t(sapply(stats, function(x) {
y = x$roGFP2.ratio[x$edge == "edge wells"]
n = x$roGFP2.ratio[x$edge == "non edge wells"]
fc = log2(median(y, na.rm = T)/median(n, na.rm = T))
pv = wilcox.test(y, n)$p.value
# cd = effectsize::cohens_d(y, n, pooled_sd = F)$Cohens_d p = signif(p,2)
# return(c(log2fc = fc, wilcoxP = pv, CohenD = cd))
return(c(log2fc = fc, wilcoxP = pv))
}))
a = do.call("rbind", strsplit(rownames(stats), "-"))
colnames(a) = c("Plate", "Nutrient", "Condition")
stats = data.frame(a, stats, row.names = NULL, stringsAsFactors = F)
stats$log2fc = round(stats$log2fc, 2)
stats$wilcoxP = signif(-log10(p.adjust(stats$wilcoxP, method = "fdr")), 2)
# stats$CohenD = round(stats$CohenD, 2)
# Volcano plot
p3 = ggplot(stats, aes(x = log2fc, y = wilcoxP, text = Plate)) + theme_bw(base_size = 8) +
xlim(-1, 1) + labs(y = "-log10(fdr pval)") + geom_point(size = 0.8) + geom_hline(yintercept = -log10(0.05)) +
# geom_vline(xintercept = c(-0.5, 0.5), lty = 2) +
geom_vline(xintercept = c(-1, 1), lty = 1) + facet_wrap(Nutrient ~ Condition, scales = "free") +
geom_text_repel(data = stats[abs(stats$log2fc) > 0.5 & stats$wilcoxP > -log10(0.05),
], aes(label = Plate), size = 2) + theme(panel.grid = element_blank())
ggsave(filename = paste0(path, "analysis/QC/roGFP2_ratio_RAWdata_edge_effects_badPlates.pdf"),
plot = p3, width = 7.5, height = 6)
ggplotly(p3)Variance among controls
Next we look into the variance exhibited by the controls across plates and nutrient conditions. Ideally, all controls in the plate should have similar values i.e variance close to zero. Below we show the variance of raw roGFP2 ratios among controls per plate and group.
ctrlVar = rawDatCleaned %>% filter(Content == "Control") %>% select(Gene.Symbol,
Plate, Type, Group, roGFP2.ratio) %>% group_by(Plate, Type, Group) %>% mutate(Control_variance = round(var(roGFP2.ratio,
na.rm = T), 4)) %>% ungroup() %>% select(Gene.Symbol, Plate, Type, Group, Control_variance) %>%
rename(Genes = Gene.Symbol, Nutrient = Type) %>% mutate(Plate = paste0("P-",
Plate)) %>% distinct()
plot_ly(ctrlVar, x = ~Group, y = ~Plate, z = ~Control_variance, text = ~Genes, type = "scatter3d",
mode = "markers", marker = list(size = 3), opacity = 1, color = ~Nutrient)Next we remove all genes with variance among controls > 0.05
rmv = ctrlVar[which(ctrlVar$Control_variance > 0.05), ]
rmv$Plate = gsub("P-", "", rmv$Plate)
rmv = paste(rmv$Genes, rmv$Plate, rmv$Nutrient, sep = "-")
paste0("Number of poor control genes removed - ", length(rmv))[1] "Number of poor control genes removed - 3"
matchID = paste(rawDatCleaned$Gene.Symbol, rawDatCleaned$Plate, rawDatCleaned$Type,
sep = "-")
rawDatCleaned_ctrlfilt = rawDatCleaned[!matchID %in% rmv, ]
ctrlVar = rawDatCleaned_ctrlfilt %>% filter(Content == "Control") %>% select(Gene.Symbol,
Plate, Type, Group, roGFP2.ratio) %>% group_by(Plate, Type, Group) %>% mutate(Control_variance = round(var(roGFP2.ratio,
na.rm = T), 4)) %>% ungroup() %>% select(Gene.Symbol, Plate, Type, Group, Control_variance) %>%
rename(Genes = Gene.Symbol, Nutrient = Type) %>% mutate(Plate = paste0("P-",
Plate)) %>% distinct()
plot_ly(ctrlVar, x = ~Group, y = ~Plate, z = ~Control_variance, text = ~Genes, type = "scatter3d",
mode = "markers", marker = list(size = 3), opacity = 1, color = ~Nutrient)Next we we visualize the fact that in a good screen, individual control values should not deviate much from the median value of all controls from that same plate. For a given plate \(i\), the normalized control values i.e \(NormCtrl\) is given as - \[NormCtrl_i = \frac{Ctrl_i} {median(Ctrl_i)}\] We plot these scaled control values per plate across conditions below. Ideally these values should be close to 1 (shown as red horizontal lines). A table for these scaled control values is also provided.
ctrlDat = split(rawDatCleaned_ctrlfilt, rawDatCleaned_ctrlfilt$Type)
ctrlDat = lapply(ctrlDat, function(x) {
split(x, x$Plate)
})
for (i in 1:length(ctrlDat)) {
for (j in 1:length(ctrlDat[[i]])) {
tmp = droplevels(ctrlDat[[i]][[j]])
tmp = tmp[, -2]
# Plate control
tmp.ctrl = tmp[tmp$Content == "Control", ]
# Normalizing factor
nf = median(tmp.ctrl$roGFP2.ratio, na.rm = T)
# Measure control value deviation from plate median
ctrlDat[[i]][[j]] = data.frame(Gene = tmp.ctrl$Gene.Symbol, NormControl = round(log2(tmp.ctrl$roGFP2.ratio/nf),
3), stringsAsFactors = F)
# Deleting
rm(tmp, tmp.ctrl, nf)
}
rm(j)
}
rm(i)
ctrlDat = melt(ctrlDat, id = "Gene")
ctrlDat = ctrlDat[, c(4, 5, 1, 3)]
colnames(ctrlDat) = c("Plate", "Nutrient", "Gene", "NormControl")
ctrlDat$Plate = as.factor(ctrlDat$Plate)
ctrlDat$Nutrient = factor(ctrlDat$Nutrient, levels = c("Glucose", "Galactose", "Glycerol"))
# datatable(ctrlDat, rownames = FALSE, filter='top', class='compact', extensions
# = c('Buttons') , options = list(autoWidth = TRUE, dom = 'Blfrtip', buttons =
# c('csv', 'excel') ))
p = ggplot(ctrlDat, aes(x = Plate, y = NormControl)) + theme_bw(base_size = 8) +
labs(y = "Control / Median plate control (roGFP2 ratios) in log2 scale") + geom_boxplot(outlier.size = 0.1,
lwd = 0.2) + facet_wrap(~Nutrient) + geom_hline(yintercept = 3, color = "orange") +
geom_hline(yintercept = -3, color = "orange") + # geom_text_repel(data = subset(ctrlDat, NormControl > 3), aes(x = Plate, y =
# NormControl, label = Gene), size = 2) +
theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5))
pNext we remove those controls that deviate 3 log fold units over the plate control median value
rmv = ctrlDat[which(ctrlDat$NormControl > 3 | ctrlDat$NormControl < -3), ]
rmv = paste(rmv$Gene, rmv$Plate, rmv$Nutrient, sep = "-")
ctrlDat = ctrlDat[!(ctrlDat$NormControl > 3 | ctrlDat$NormControl < -3), ]
paste0("Number of control genes removed which deviated 3 log fold units from the plate median - ",
length(rmv))[1] "Number of control genes removed which deviated 3 log fold units from the plate median - 14"
matchID = paste(rawDatCleaned_ctrlfilt$Gene.Symbol, rawDatCleaned_ctrlfilt$Plate,
rawDatCleaned_ctrlfilt$Type, sep = "-")
rawDatCleaned_ctrlfilt = rawDatCleaned_ctrlfilt[!matchID %in% rmv, ]Plotting these scaled control values -
p = ggplot(ctrlDat) + theme_bw(base_size = 8) + labs(y = "Control / Median plate control (roGFP2 ratios) in log2 scale") +
geom_boxplot(aes(x = Plate, y = NormControl), outlier.size = 0.1, lwd = 0.2) +
facet_wrap(~Nutrient) + # geom_text_repel(data = subset(ctrlDat, NormControl > 3), aes(x = Plate, y =
# NormControl, label = Gene), size = 2) +
theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5))
pA more standard way of calculating the variance among the controls is by computing the robust coefficient of variance see here. For a given plate \(i\), this is given as - \[CoefVar_i = \frac{mad(Ctrl_i)}{median(Ctrl_i)}\] mad is Median absolute deviation
Plotting these robust coefficient of variation for control values-
ctrlDat$NormControl = 2^ctrlDat$NormControl
cvDat = split(ctrlDat, ctrlDat$Nutrient)
cvDat = lapply(cvDat, function(x) {
a = split(x, x$Plate)
b = lapply(a, function(y) mad(y$NormControl)/median(y$NormControl))
return(b)
})
cvDat = melt(cvDat)
cvDat = cvDat[, c(2, 3, 1)]
colnames(cvDat) = c("Plate", "Nutrient", "Dev")
cvDat$Plate = as.factor(cvDat$Plate)
cvDat$Nutrient = factor(cvDat$Nutrient, levels = c("Glucose", "Galactose", "Glycerol"))
p = ggplot(cvDat, aes(Plate, Dev)) + theme_bw(base_size = 8) + labs(y = "% of deviation from median plate control roGFP2 ratios") +
geom_bar(stat = "identity") + facet_wrap(~Nutrient) + # geom_hline(yintercept = 0.5, color = 'black') +
geom_hline(yintercept = 1, color = "black") + theme(panel.grid = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5))
ggsave(filename = paste0(path, "analysis/QC/roGFP2_ratio_RAWdata_Controls_Coef_variation.pdf"),
plot = p, width = 7, height = 3.5)
pSaving the data
Finally we save the cleaned raw Data as a .RDS (R data object), which will be used for the next step of normalization
Session information
R version 3.6.2 (2019-12-12)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS 10.16
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] ggallin_0.1.1 WriteXLS_5.0.0 DT_0.12 plotly_4.9.2
[5] lsr_0.5 ggpubr_0.2.4 magrittr_1.5 ggrepel_0.8.1
[9] ggplot2_3.2.1 reshape2_1.4.3 rmdformats_0.3.6 knitr_1.28
loaded via a namespace (and not attached):
[1] tidyselect_1.0.0 xfun_0.12 purrr_0.3.3 colorspace_1.4-1
[5] vctrs_0.2.2 htmltools_0.4.0 viridisLite_0.3.0 yaml_2.2.1
[9] rlang_0.4.4 pillar_1.4.3 later_1.0.0 glue_1.3.1
[13] withr_2.1.2 RColorBrewer_1.1-2 lifecycle_0.1.0 plyr_1.8.5
[17] stringr_1.4.0 munsell_0.5.0 ggsignif_0.6.0 gtable_0.3.0
[21] htmlwidgets_1.5.1 evaluate_0.14 labeling_0.3 fastmap_1.0.1
[25] Cairo_1.5-11 httpuv_1.5.5 crosstalk_1.0.0 Rcpp_1.0.3
[29] xtable_1.8-4 promises_1.1.0 scales_1.1.0 formatR_1.7
[33] jsonlite_1.6.1 mime_0.9 farver_2.0.3 digest_0.6.23
[37] stringi_1.4.5 bookdown_0.17 dplyr_0.8.4 shiny_1.4.0
[41] grid_3.6.2 tools_3.6.2 lazyeval_0.2.2 tibble_2.1.3
[45] crayon_1.3.4 tidyr_1.0.2 pkgconfig_2.0.3 ellipsis_0.3.0
[49] data.table_1.12.8 assertthat_0.2.1 rmarkdown_2.1 httr_1.4.1
[53] R6_2.4.1 compiler_3.6.2